package com.apobates.forum.core.tag.nlp;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.apobates.forum.core.tag.TagExtraction;
import com.apobates.forum.utils.Commons;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

public class TagNLPExtraction implements TagExtraction{
    // Maximum number of tags returned by extract().
    private final int allowSize;
    // Permitted word natures (parts of speech); an empty set means "allow all natures".
    private final Set<WordNatureEnum> wordNatures;

    /**
     * Creates an extractor backed by the HanLP segmenter.
     *
     * @param allowSize maximum number of tags {@link #extract} may return
     * @param nature    permitted word natures; pass none to accept every nature
     */
    public TagNLPExtraction(int allowSize, WordNatureEnum... nature) {
        super();
        this.allowSize = allowSize;
        this.wordNatures = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(nature)));
    }

    /**
     * Extracts up to {@link #getAllowSize()} tags from the given title and content,
     * merged with caller-supplied keywords, ordered by descending frequency.
     *
     * @param title    optional title text (HTML is purified before segmentation)
     * @param content  optional body text (HTML is purified before segmentation)
     * @param keywords caller-supplied tags, each given a fixed weight of 2;
     *                 an existing segmented count for the same (case-insensitive) word wins
     * @return insertion-ordered map of tag to frequency, highest frequency first;
     *         empty when both title and content are blank
     */
    @Override
    public LinkedHashMap<String, Integer> extract(String title, String content, String... keywords) { //20200717
        // Local, single-threaded buffer: StringBuilder, not the synchronized StringBuffer.
        StringBuilder sb = new StringBuilder();
        if (Commons.isNotBlank(title)) {
            sb.append(Commons.htmlPurifier(title));
        }
        if (Commons.isNotBlank(content)) {
            sb.append(Commons.htmlPurifier(content));
        }
        if (sb.length() == 0) {
            return new LinkedHashMap<>();
        }
        // Tags mined from the text; keys compare case-insensitively (see segment()).
        TreeMap<String, Integer> tages = segment(sb.toString(), wordNatures);
        // Merge caller-supplied keywords directly: weight 2, existing counts win on collision.
        // (The original built an intermediate case-insensitive TreeMap first; because the
        // merge function keeps the old value, a direct per-keyword merge is equivalent.)
        Stream.of(keywords)
                .filter(Commons::isNotBlank) // skip blank keywords instead of emitting empty tags
                .forEach(k -> tages.merge(k, 2, (oldValue, newValue) -> oldValue));
        return tages.entrySet().stream()
                .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
                .limit(getAllowSize())
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
                        (oldValue, newValue) -> oldValue, LinkedHashMap::new));
    }

    @Override
    public String getLineSeparator() {
        return "p";
    }

    public int getAllowSize() {
        return allowSize;
    }

    public Set<WordNatureEnum> getWordNatures() {
        return wordNatures;
    }

    // Blacklist of punctuation/markup fragments that must never become tags.
    // Includes getLineSeparator() so stray paragraph markers are stripped too.
    private List<String> stripCharacterSymbol() {
        List<String> data = new ArrayList<>(Arrays.asList("）", "（", "+", "-", "“", "”", "：", "？", ">", "<", "!", "！", "%", "％", "<p>", "p>", "p><p>"));
        data.add(getLineSeparator());
        return data;
    }

    /**
     * Segments {@code content} with the open-source HanLP NLP library and counts
     * term frequencies.
     *
     * <p>NOTE(review): the original comment claimed the result distinguishes case,
     * but the returned map uses {@code String.CASE_INSENSITIVE_ORDER} — "Java" and
     * "java" share one entry. Verify which behavior callers expect.
     *
     * @param content     text to segment; blank input yields an empty map
     * @param wordNatures permitted natures; an empty set disables the nature filter
     * @return case-insensitive word-to-frequency map
     */
    public TreeMap<String, Integer> segment(String content, Set<WordNatureEnum> wordNatures) {
        TreeMap<String, Integer> result = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
        if (!Commons.isNotBlank(content)) {
            return result;
        }
        List<String> allowedNatures = wordNatures.stream()
                .map(WordNatureEnum::getNature)
                .collect(Collectors.toList());
        // Hoisted out of the loop: the symbol blacklist never changes per term.
        List<String> illegalSymbols = stripCharacterSymbol();
        for (Term t : HanLP.newSegment().seg(content)) {
            // HanLP renders a term as "word/nature"; cache the String once per term.
            String term = t.toString();
            int slash = term.indexOf('/');
            if (slash < 0) {
                continue; // no nature marker — same skip as the original null-word branch
            }
            String word = term.substring(0, slash).trim().replaceAll("\\s+", "");
            String nature = term.substring(slash + 1);
            // Drop punctuation/markup fragments and words emptied by whitespace stripping.
            if (word.isEmpty() || illegalSymbols.contains(word)) {
                continue;
            }
            // Accept when no filter is configured, or the term's nature equals or
            // starts with an allowed nature (startsWith subsumes the equality check).
            if (allowedNatures.isEmpty()
                    || allowedNatures.stream().anyMatch(nature::startsWith)) {
                result.merge(word, 1, Integer::sum);
            }
        }
        return result;
    }
}
